In [18]:

    
import pandas as pd
from pymongo import MongoClient
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.dates import YearLocator, DayLocator, MonthLocator, DateFormatter
from nltk.corpus import stopwords as sw
stopwords = sw.words('german')
import numpy as np

Corpus growth



In [13]:

    
date_range = pd.date_range('2016-11-21', datetime.now(), period='1D')
client = MongoClient('mongodb://localhost:27017/')
db = client.articles



In [14]:

    
article_counts = []
for date in date_range:
    num_articles = db.articles.find({'date': {'$lt': date}, 'text': { '$exists': True, "$ne" : "" } }).count()
    article_counts.append((date, num_articles))



In [15]:

    
%matplotlib inline

dates = [x for x,y in article_counts]
counts = [y for x,y in article_counts]

fig, ax = plt.subplots()
ax.plot_date(dates, counts, 'g', color = '0.4')

ax.xaxis.set_major_locator(MonthLocator())
ax.xaxis.set_major_formatter(DateFormatter('%Y-%m'))

ax.fmt_xdata = DateFormatter('%Y-%m')
fig.autofmt_xdate()
#ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=45)

#plot = plt.plot(article_counts)
plt.ylabel('number of articles')
plt.savefig('corpus_size.pdf', format='pdf')
plt.show()

Article Length Histogram



In [ ]:

    
all_articles = db.articles.find()
article_sizes = []
for i, article in enumerate(all_articles):
    print(i)
    article_sizes.append(len(article['text']))



In [21]:

    
%matplotlib inline

def is_outlier(points, thresh=3.5):

    return x > 10000 or x == 0

import pickle

#with open('article_sizes.pickle','wb') as f:
    #pickle.dump(article_sizes, f)
    #article_sizes2 = pickle.load(f)

filtered_sizes = [x for x in article_sizes if not is_outlier(x)]
print(len(filtered_sizes))
print(len(filtered_sizes) / float(len(article_sizes)))

plt.hist(filtered_sizes, bins=20, color='0.75')
plt.ylabel('number of articles')
plt.xlabel('article size')
plt.savefig('article_size.pdf', format='pdf')
plt.show()









    



68760
0.885204114474

Word frequency histogram



In [13]:

    
# load all category corpora
from collections import Counter
category_names = ['Sonstiges', 'Aktuell', 'Lifestyle', 
          'Wirtschaft', 'Finanzen', 'Ausland', 'Lokal', 
          'Politik', 'Sport', 'Technologie', 'Kultur', 'Ignore']

paths = [(x, "news/corpus/corpus{}.txt".format(x)) for x in category_names]

def load_sets(paths):
    X, y = [], []

    for name, path in paths:
        with open(path) as cur_file:
            for line in cur_file:
                tokens = [x for x in line.split() if x not in stopwords]
                if len(tokens) > 0:
                    X.append(tokens)
                    y.append(name)
    print("loaded {} articles".format(len(X)))
    return X, y

X, y = load_sets(paths)









    



/Users/daniel/Library/Python/2.7/lib/python/site-packages/ipykernel/__main__.py:15: UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal






    



loaded 49621 articles



In [14]:

    
#build the dict
from IPython.display import clear_output
total = Counter()
tot = len(X)
for i, article in enumerate(X):
    if i%100 == 0:
        clear_output()
        print("{}/{}".format(i, tot))
    total.update(article)









    



49600/49621



In [15]:

    
print(total['und'])



In [16]:

    
total_words = sum(total.values())
unique_words = len(total.values())

print("Total words: {}\nUnique words: {}".format(total_words, unique_words))
frequent_words = {k: v for k,v in total.iteritems() if v >= 5}


sum_frequent_words = sum(frequent_words.values())
num_frequent_words = len(frequent_words.values())
print(sum_frequent_words)
print("{} frequent words".format(num_frequent_words))









    



Total words: 13233740
Unique words: 377777
12804395
108900 frequent words



In [27]:

    
%matplotlib inline
plt.yscale('log', nonposy='clip')
plt.hist(total.values(), bins=np.logspace(0.1, 5, 20), color='0.75')
plt.gca().set_xscale("log")
plt.ylabel('number of words')
plt.xlabel('word frequency')
plt.savefig('word_frequencies.pdf', format='pdf')
plt.show()



In [ ]:



In [ ]: